Load the necessary R libraries
#devtools::install_github("GuangchuangYu/ggtree")
library(treeio)
library(ggtree)
library(phytools)
library(viridisLite)
library(ggplot2)
library(googlesheets4)
library(dplyr)
Load data from google spreadsheet (Already stored in an R object that you can just load)
#id<-"1e7Hj_BQ8rke5XW7j93YL6QVAGbcvsb2yw7oD86WnPJY"
#gsheet<-read_sheet(id,sheet="GlobalOverview",skip = 2,na = c(""," ","NA","#DIV/0!"),col_types = "c",trim_ws=T)
#GlobalOverview<-as.data.frame(gsheet,stringsAsFactors=F)
#save(GlobalOverview,file="../../misc/data/gsheet.GlobalOverview.21-09-22.Robj")
load("/Users/Janina/Documents/MASTER/Masterarbeit/lgt/1_code/r_scripts/gsheet.GlobalOverview.21-09-22.Robj")
Create a dataframe where the first row is the GAGA id (e.g. GAGA-0002), which are also used as the tip labels in the phylogeny
data2add<-data.frame(GAGA.id = GlobalOverview$`GAGA ID`, GlobalOverview)
Create a column with clean species IDs for all species.
This has to be done, because the database of all the sequenced species is a bit convoluted.
data2add$cleanSpeciesID<-dplyr::coalesce(data2add$Species,data2add$Species..from.Assembly.Overview.)
Load the tree file of all the GAGA genomes.
tree <- read.iqtree("/Users/Janina/Documents/MASTER/Masterarbeit/lgt/1_code/r_scripts/hymenoptera_psc98_partition_b1000_40threads_msubnucl.treefile")
Create a very quick and dirty plot of the phylogeny
plot(tree@phylo,show.node.label = T,show.tip.label = T,cex=.5)
nodelabels(bg=rgb(0,0,0,0),col="red",frame = "none")
check the plot to see where the root should be. Here we root the tree at node 166 that separates the outgroups, the honeybee “Amel” (Apis mellifera) and the parasitoid wasp Nasonia (Nvit), from ants.
#tree.rooted<-treeio::root(tree,node=166)
tree.rooted <- tree
tree.rooted.subset<-treeio::drop.tip(tree.rooted,tip=c("NCBI-0018_Amel","NCBI-0019_Nvit"))
Plot basic tree
tp<-ggtree(tree.rooted.subset,ladderize = T, layout = "circular")+geom_rootedge(.01)
Add the species identity from FAS.genes.and.names.tsv
tp$data$label[tp$data$isTip==T]<-gsub("_.*","",tp$data$label[tp$data$isTip==T])
tp2<-tp %<+% data2add
tp2
#install.packages("ggnewscale")
library("ggnewscale")
#Make Subfamily as a factor (discrete color scale for factors)
tp3$data$Subfamily<-as.factor(tp3$data$Subfamily)
tp3<-tp2 +
geom_tiplab2(aes(label=paste(label,cleanSpeciesID,sep=" "), color = Subfamily),size=2,align = T,linesize = .1)+ #add the tip labels
geom_nodepoint(aes(fill=UFboot),stroke=0.5,pch=21,size=2)+ #add the info about bootstrapping success as points of different colors to the nodes
scale_fill_viridis_c()+ #change the color of the UFboot node points to a viridis color scale
theme(legend.position = "left",
legend.title=element_text(size=9), # the title size of legend.
legend.text=element_text(size=7)
)
tp3 #plot the tree
ggsave("/Users/Janina/Documents/MASTER/Masterarbeit/plots/GAGA_phylogenies/Subfamilies.pdf",
width = 15,height=30) #save the tree in a pdf
Gather subfamily info for NCBI IDs as well
tp3$data[is.na(tp3$data$Subfamily) & tp3$data$isTip==T,]
#Add the raw LGT dataframe
library(readxl)
LGTs.raw<-read_excel("/Users/Janina/Documents/MASTER/Masterarbeit/lgt/0_data/GAGA.LGT.162.resulting.good.candidates.xlsx")
## New names:
## * `` -> ...27
## * `` -> ...28
## * `` -> ...29
LGTs.raw$...27 <- NULL
LGTs.raw$...28 <- NULL
LGTs.raw$...29 <- NULL
LGTs.raw$comment <- NULL
#Clean up the LGT dataframe to remove the database origin (GAGA-0001.euk -> GAGA-0001)
LGTs<-LGTs.raw
LGTs$GAGAid<-gsub("\\..*","",LGTs$.id)
Plot the circular phylogeny and add red dot to the tip, where the size of the red dot represents the number of identified LGTs.
#Create a tally dataframe that counts how many LGT candidates we have per species.
LGT.tally<-as.data.frame(table(select(LGTs,GAGAid)))
colnames(LGT.tally)<-c("GAGAid","LGT.count")
#Add the LGT.tally dataframe to the phylogeny object as additional data.
tp4<-tp3 %<+% LGT.tally
#see tp4$data$LGT.count
#Plot the phylogeny and add red dot to the tip, where the size of the red dot represents the number of identified LGTs
tp5<-tp4+geom_tippoint(aes(size=LGT.count),col="red", alpha=.5)
tp5
## Warning: Removed 77 rows containing missing values (geom_point_g_gtree).
ggsave("/Users/Janina/Documents/MASTER/Masterarbeit/plots/GAGA_phylogenies/GAGA.phylogeny.LGTs.pdf",width = 12,height=24) #save the tree in a pdf
## Warning: Removed 77 rows containing missing values (geom_point_g_gtree).
Create a dataframe that counts the genomes with ankyrin repeats
#Only select the ankyrins to show them as a heatmap
LGT.ankyrins<-subset(LGTs,predicted_protein=="ankyrin repeat protein") %>% group_by(GAGAid) %>% tally()
colnames(LGT.ankyrins)<-c("GAGAid","protein")
LGT.ankyrins$type<-"ankyrin repeat protein"
knitr::kable(head(LGT.ankyrins))
| GAGAid | protein | type |
|---|---|---|
| GAGA-0020 | 9 | ankyrin repeat protein |
| GAGA-0024 | 7 | ankyrin repeat protein |
| GAGA-0028 | 27 | ankyrin repeat protein |
| GAGA-0063 | 2 | ankyrin repeat protein |
| GAGA-0087 | 31 | ankyrin repeat protein |
| GAGA-0098 | 2 | ankyrin repeat protein |
#Only select the ankyrins and lysozymes to show them on the phylogeny
LGT.anks.lys<-subset(LGTs,predicted_protein %in% c("ankyrin repeat protein", "Lysozyme")) %>% group_by(GAGAid)
LGT.anks.lys<-LGT.anks.lys[,c(24,26)]
LGT.anks.lys <- LGT.anks.lys %>% count(GAGAid,predicted_protein, sort = T)
colnames(LGT.anks.lys)<-c("GAGAid","protein","count")
knitr::kable(head(LGT.anks.lys))
| GAGAid | protein | count |
|---|---|---|
| GAGA-0087 | ankyrin repeat protein | 31 |
| GAGA-0028 | ankyrin repeat protein | 27 |
| GAGA-0401 | ankyrin repeat protein | 16 |
| GAGA-0020 | ankyrin repeat protein | 9 |
| GAGA-0364 | ankyrin repeat protein | 8 |
| GAGA-0024 | ankyrin repeat protein | 7 |
Plot the phylogeny and add ankyrin repeat proteins and lysozymes as a heatmap layer
library(ggtree)
library(ggtreeExtra)
library(ggplot2)
library(ggnewscale)
library(dplyr)
library(tidytree)
library(ggstar)
# For the bar layer outside of the tree
tp5.5 <- tp5 +
new_scale_fill() +
geom_fruit(
data=LGT.anks.lys,
geom=geom_bar,
mapping=aes(y=GAGAid, x=count,fill=protein), #The count will be mapped to x
pwidth=1, # Width of the external layer
stat="identity",
orientation="y", #The orientation of the axis
position=position_stackx(hexpand=.7), #Adjust the position of the geom_layer
axis.params =list(
axis="x", # add axis text of the layer.
vjust=0.2,
text.angle=0, # the text size of axis.
text.size=1.5,
hjust=1, # adjust the horizontal position of text of axis
nbreak=2, # number of lines within geom_layer,
),
grid.params=list(alpha=.3) # Parameter to adjust gridlines around the tree
) +
scale_fill_manual(
values=c("#339933","#dfac03"), #color of the external geom_layer
name="Protein", #name of the legend for the external geom_layer
guide=guide_legend(keywidth=0.9,keyheight=0.9,order=6)
) +
theme(legend.position=c(0.96, 0.5), # the position of legend.
legend.background=element_rect(fill=NA), # the background of legend.
legend.title=element_text(size=9), # the title size of legend.
legend.text=element_text(size=7), # the text size of legend.
legend.spacing.y = unit(0.02, "cm")
)
tp5.5
## Warning: Removed 77 rows containing missing values (geom_point_g_gtree).
ggsave("/Users/Janina/Documents/MASTER/Masterarbeit/plots/GAGA_phylogenies/GAGA.phylogeny.anks.lys.pdf",width=13,height=26) #save the tree in a pdf
## Warning: Removed 77 rows containing missing values (geom_point_g_gtree).
LGT.proteins<-LGTs[,c(24,26)]
LGT.proteins <- LGT.proteins %>% count(GAGAid,predicted_protein, sort = T)
colnames(LGT.proteins)<-c("GAGAid","protein","count")
LGT.proteins <- na.omit(LGT.proteins)
knitr::kable(head(LGT.proteins))
| GAGAid | protein | count |
|---|---|---|
| GAGA-0087 | ankyrin repeat protein | 31 |
| GAGA-0028 | ankyrin repeat protein | 27 |
| GAGA-0401 | ankyrin repeat protein | 16 |
| GAGA-0020 | ankyrin repeat protein | 9 |
| GAGA-0364 | ankyrin repeat protein | 8 |
| GAGA-0024 | ankyrin repeat protein | 7 |
library("viridis")
# For the bar layer outside of the tree
tp5.5 <- tp5 +
new_scale_fill() +
geom_fruit(
data=LGT.proteins,
geom=geom_bar,
mapping=aes(y=GAGAid, x=count,fill=protein), #The count will be mapped to x
pwidth=1, # Width of the external layer
stat="identity",
orientation="y", #The orientation of the axis
position=position_stackx(hexpand=.7) #Adjust the position of the geom_layer
# axis.params =list(
# axis="x", # add axis text of the layer.
# vjust=0.2,
# text.angle=0, # the text size of axis.
# text.size=1.5,
# hjust=1, # adjust the horizontal position of text of axis
# nbreak=2, # number of lines within geom_layer,
# ),
# grid.params=list(alpha=.5) # Parameter to adjust gridlines around the tree
) +
scale_fill_discrete( #color of the external geom_layer
name="Protein", #name of the legend for the external geom_layer
guide=guide_legend(keywidth=0.9,keyheight=0.9,order=6, ncol=1)
) +
theme(legend.position=c(0.1, 0.5), # the position of legend.
legend.background=element_rect(fill=NA), # the background of legend.
legend.title=element_text(size=9), # the title size of legend.
legend.text=element_text(size=7), # the text size of legend.
legend.spacing.y = unit(0.02, "cm")
)
tp5.5
## Warning: Removed 77 rows containing missing values (geom_point_g_gtree).
ggsave("/Users/Janina/Documents/MASTER/Masterarbeit/plots/GAGA_phylogenies/GAGA.phylogeny.proteins.pdf",width=13,height=26) #save the tree in a pdf
## Warning: Removed 77 rows containing missing values (geom_point_g_gtree).
LGT.bacteria<-LGTs[,c(20,26)]
LGT.bacteria <- LGT.bacteria %>% count(GAGAid,suggested_prokaryote_origin, sort = T)
colnames(LGT.bacteria)<-c("GAGAid","prokaryote_origin","count")
LGT.bacteria <- na.omit(LGT.bacteria)
LGT.bacteria.WB.BM <- subset(LGT.bacteria,prokaryote_origin %in% c("Wolbachia", "Blochmannia")) %>% group_by(GAGAid)
knitr::kable(head(LGT.bacteria.WB.BM))
| GAGAid | prokaryote_origin | count |
|---|---|---|
| GAGA-0028 | Wolbachia | 34 |
| GAGA-0087 | Wolbachia | 33 |
| GAGA-0401 | Wolbachia | 21 |
| GAGA-0222 | Wolbachia | 11 |
| GAGA-0223 | Wolbachia | 11 |
| GAGA-0020 | Wolbachia | 10 |
Add the Wolbachia and Blochmannia origin
tp6 <- tp5.5 +
new_scale_fill() +
geom_fruit(
data=LGT.bacteria.WB.BM,
geom=geom_bar,
mapping=aes(y=GAGAid, x=count,fill=prokaryote_origin), #The count will be mapped to x
pwidth=1, # Width of the external layer
stat="identity",
orientation="y", #The orientation of the axis
#position=position_stackx(hexpand=.7)#, #Adjust the position of the geom_layer
#axis.params =list(
#axis="x", # add axis text of the layer.
# vjust=0.2,
#text.angle=0, # the text size of axis.
#text.size=1.5,
# hjust=1, # adjust the horizontal position of text of axis
# nbreak=2, # number of lines within geom_layer,
# ),
# grid.params=list(alpha=.4) # Parameter to adjust gridlines around the tree
) +
scale_fill_manual(
values=c("black","grey"), #color of the external geom_layer
name="Prokaryote origin", #name of the legend for the external geom_layer
guide=guide_legend(keywidth=0.9,keyheight=0.9,order=6, ncol=1)
) +
theme(legend.position=c(0.07, 0.5), # the position of legend.
legend.background=element_rect(fill=NA), # the background of legend.
legend.title=element_text(size=9), # the title size of legend.
legend.text=element_text(size=7), # the text size of legend.
legend.spacing.y = unit(0.02, "cm")
)
tp6
## Warning: Removed 77 rows containing missing values (geom_point_g_gtree).
ggsave("/Users/Janina/Documents/MASTER/Masterarbeit/plots/GAGA_phylogenies/GAGA.phylogeny.WB.BM.pdf",width=15,height=30) #save the tree in a pdf
## Warning: Removed 77 rows containing missing values (geom_point_g_gtree).